#######################################################################
#####                                                            ######
#####   Input: Various dictionary files; word vectors            ######
#####   Output: Dictionaries for topic areas                     ######
#####                                                            ######
#######################################################################

# Load libraries

library(quanteda) # v3.3.1
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.4
library(data.table) # v1.14.8
library(text2vec) # v0.6.3

#' Logistic (sigmoid) transform with adjustable slope and midpoint
#'
#' @param x Numeric vector of values to transform.
#' @param a Slope: larger values make the transition around `c` steeper.
#' @param c Midpoint: the value of `x` that is mapped to 0.5.
#' @return Numeric vector the same length as `x`, with values between 0 and 1.
sigmoid_new <- function(x, a, c) {
  exponent <- -a * (x - c)
  1 / (1 + exp(exponent))
}
#' Strip leading and trailing whitespace from each string
#'
#' @param s Character vector.
#' @return Character vector with whitespace removed from both ends of every
#'   element; interior whitespace is left untouched.
trim <- function(s) {
  gsub(pattern = "^[[:space:]]+|[[:space:]]+$", replacement = "", x = s)
}

#' Score every vocabulary word by similarity to a dictionary's mean embedding
#'
#' Looks up the dictionary entries among the row names of `word_vectors`,
#' averages the matched embeddings into a single centroid vector, and computes
#' the cosine similarity between that centroid and every row of
#' `word_vectors`.
#'
#' @param dictionary_words Character vector of dictionary entries. An entry
#'   containing `*` (or equal to the literal string "^[[:digit:]]") has its
#'   asterisks removed and is then used as a regular expression anchored at
#'   the start of the word; any other entry must equal a row name exactly.
#'   `NA` entries are ignored.
#' @param word_vectors Numeric matrix of embeddings with one row per word and
#'   the words as row names.
#' @param sigmoid_slope,sigmoid_midpoint Slope and midpoint handed to
#'   `sigmoid_new()` when rescaling the similarities. The defaults (40 and
#'   0.35) reproduce the previously hard-coded values.
#' @return A data.table with one row per vocabulary word, sorted by decreasing
#'   similarity, with columns `word`, `score` (cosine similarity to the
#'   centroid), `in_original_dictionary` (whether the word was matched by a
#'   dictionary entry) and `sigmoid` (the rescaled score).
embedding_dictionary_scores <- function(dictionary_words, word_vectors,
                                        sigmoid_slope = 40,
                                        sigmoid_midpoint = 0.35) {

  vocabulary <- rownames(word_vectors)
  if (is.null(vocabulary)) {
    stop("`word_vectors` must have the words as row names.", call. = FALSE)
  }

  # An NA entry would make the `if` condition below NA and abort the call
  entries <- as.character(dictionary_words)
  entries <- entries[!is.na(entries)]

  matches <- lapply(entries, function(entry) {
    is_pattern <- grepl("*", entry, fixed = TRUE) || entry == "^[[:digit:]]"
    if (is_pattern) {
      # Wildcard entry: match every vocabulary word starting with the stem
      stem <- gsub("*", "", entry, fixed = TRUE)
      grep(paste0("^", stem), vocabulary)
    } else {
      which(vocabulary == entry)
    }
  })

  matches <- unique(unlist(matches))

  if (length(matches) == 0L) {
    # The mean of zero rows is NaN, so every score below would be NaN
    warning("No dictionary entry matched the vocabulary; scores will be NaN.",
            call. = FALSE)
  }

  target_vector <- word_vectors[matches, , drop = FALSE]

  # Create mean word-embedding for dictionary (1 x dimensions matrix)
  target_vector <- t(as.matrix(apply(target_vector, 2, mean)))

  # Cosine similarity between the mean embedding and each vocabulary word
  cos_sim <- sim2(target_vector, word_vectors, method = "cosine", norm = "l2")

  word_scores <- data.table(
    word = dimnames(cos_sim)[[2]],
    score = cos_sim[1, ],
    in_original_dictionary = dimnames(cos_sim)[[2]] %in% vocabulary[matches]
  )

  word_scores <- word_scores[order(word_scores$score, decreasing = TRUE), ]

  word_scores$sigmoid <- sigmoid_new(word_scores$score,
                                     sigmoid_slope, sigmoid_midpoint)

  word_scores
}

# Convenience function to take a LIWC-style dictionary and return all matching vocabulary words

#' Expand dictionary entries into the vocabulary words they match
#'
#' Each entry of `original_words` is looked up in `vocabulary`. An entry that
#' contains `*` (or is exactly the string "^[[:digit:]]") has its asterisks
#' removed and is then used as a regular expression anchored at the start of
#' the word, so "tax*" matches "tax", "taxes", "taxation" but not "syntax".
#' Any other entry must equal a vocabulary word exactly.
#'
#' @param original_words Character vector of dictionary entries. `NA` entries
#'   are ignored.
#' @param vocabulary Character vector of candidate words. Defaults to the row
#'   names of the `word_vectors` object visible from where this function is
#'   defined, so existing one-argument calls behave as before.
#' @return Sorted character vector of the unique vocabulary words matched by
#'   at least one entry; `character(0)` when nothing matches.
expand_dictionary <- function(original_words,
                              vocabulary = rownames(word_vectors)) {

  # An NA entry would make the `if` condition below NA and abort the call
  entries <- as.character(original_words)
  entries <- entries[!is.na(entries)]

  matches <- lapply(entries, function(entry) {
    is_pattern <- grepl("*", entry, fixed = TRUE) || entry == "^[[:digit:]]"
    if (is_pattern) {
      # Wildcard entry: match every vocabulary word starting with the stem
      stem <- gsub("*", "", entry, fixed = TRUE)
      grep(paste0("^", stem), vocabulary)
    } else {
      which(vocabulary == entry)
    }
  })

  sort(unique(vocabulary[unlist(matches)]))
}

## Source dictionaries

# One column of seed words per topic area
keywords <- read.csv(file = "data/dictionaries/seed_words.csv",
                     stringsAsFactors = FALSE)

# Each topic keeps only a fixed number of leading rows from its column.
# NOTE(review): presumably each count is the number of seed words filled in
# for that column -- confirm against the CSV whenever it is edited.
defence_words     <- keywords$Defence[seq_len(47)]
economy_words     <- keywords$FinanceEconomy[seq_len(50)]
agriculture_words <- keywords$Agriculture[seq_len(39)]
health_words      <- keywords$Health[seq_len(46)]
children_words    <- keywords$ChildrenFamily[seq_len(34)]
education_words   <- keywords$Education[seq_len(48)]
social_words      <- keywords$SocialWelfare[seq_len(53)]
trade_words       <- keywords$ForeignTrade[seq_len(46)]
environment_words <- keywords$Environment[seq_len(50)]
energy_words      <- keywords$Energy[seq_len(36)]
crime_words       <- keywords$CrimePolicing[seq_len(36)]
transport_words   <- keywords$Transportation[seq_len(34)]

## Load word vectors

load("working/word_vectors_150.Rdata")

# load() restores whatever objects the file happens to contain, so fail early
# with a clear message if the matrix used below is not among them
if (!exists("word_vectors")) {
  stop("working/word_vectors_150.Rdata did not provide an object named `word_vectors`.",
       call. = FALSE)
}

## Drop parliamentary jargon vectors

# drop = FALSE keeps the result a matrix even if only a single row survives
word_vectors <- word_vectors[!grepl("parljargon|parlboilerplate", rownames(word_vectors)), , drop = FALSE]

## Define seed dictionaries

# Expand every topic's seed words against the embedding vocabulary and wrap
# the results in a quanteda dictionary keyed by topic.
# NOTE(review): `energy_words` is defined earlier in this script but no
# `energy` key is built here -- confirm the omission is intentional.
dictionary_to_use <- dictionary(lapply(
  list(
    defence = defence_words,
    economy = economy_words,
    agriculture = agriculture_words,
    health = health_words,
    children = children_words,
    education = education_words,
    social = social_words,
    trade = trade_words,
    environment = environment_words,
    crime = crime_words,
    transport = transport_words
  ),
  expand_dictionary
))

# One table of similarity scores per topic, named after the dictionary keys
word_scores <- setNames(
  lapply(
    names(dictionary_to_use),
    function(topic) {
      embedding_dictionary_scores(dictionary_to_use[[topic]],
                                  word_vectors = word_vectors)
    }
  ),
  names(dictionary_to_use)
)

save(word_scores, dictionary_to_use, file = "working/dictionaries_issues.Rdata")
